{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "import polars as pl\n",
        "import numpy as np\n",
        "from datetime import datetime, timedelta\n",
        "import io\n",
        "\n",
        "try:\n",
        "    import polars as pl\n",
        "except ImportError:\n",
        "    import subprocess\n",
        "    subprocess.run([\"pip\", \"install\", \"polars\"], check=True)\n",
        "    import polars as pl\n",
        "\n",
        "print(\"🚀 Advanced Polars Analytics Pipeline\")\n",
        "print(\"=\" * 50)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Y3zSNcMrF2-X",
        "outputId": "6155d76b-3630-4a64-aa36-457edf7fb855"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "🚀 Advanced Polars Analytics Pipeline\n",
            "==================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "np.random.seed(42)\n",
        "n_records = 100000\n",
        "dates = [datetime(2020, 1, 1) + timedelta(days=i//100) for i in range(n_records)]\n",
        "tickers = np.random.choice(['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN'], n_records)\n",
        "\n",
        "# Create complex synthetic dataset\n",
        "data = {\n",
        "    'timestamp': dates,\n",
        "    'ticker': tickers,\n",
        "    'price': np.random.lognormal(4, 0.3, n_records),\n",
        "    'volume': np.random.exponential(1000000, n_records).astype(int),\n",
        "    'bid_ask_spread': np.random.exponential(0.01, n_records),\n",
        "    'market_cap': np.random.lognormal(25, 1, n_records),\n",
        "    'sector': np.random.choice(['Tech', 'Finance', 'Healthcare', 'Energy'], n_records)\n",
        "}\n",
        "\n",
        "print(f\"📊 Generated {n_records:,} synthetic financial records\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "K7jA3DiiF6sD",
        "outputId": "8f4202ff-b2e0-4525-eb64-49a9b72430e0"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "📊 Generated 100,000 synthetic financial records\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "lf = pl.LazyFrame(data)\n",
        "\n",
        "result = (\n",
        "    lf\n",
        "    .with_columns([\n",
        "        pl.col('timestamp').dt.year().alias('year'),\n",
        "        pl.col('timestamp').dt.month().alias('month'),\n",
        "        pl.col('timestamp').dt.weekday().alias('weekday'),\n",
        "        pl.col('timestamp').dt.quarter().alias('quarter')\n",
        "    ])\n",
        "\n",
        "    .with_columns([\n",
        "        pl.col('price').rolling_mean(20).over('ticker').alias('sma_20'),\n",
        "        pl.col('price').rolling_std(20).over('ticker').alias('volatility_20'),\n",
        "\n",
        "        pl.col('price').ewm_mean(span=12).over('ticker').alias('ema_12'),\n",
        "\n",
        "        pl.col('price').diff().alias('price_diff'),\n",
        "\n",
        "        (pl.col('volume') * pl.col('price')).alias('dollar_volume')\n",
        "    ])\n",
        "\n",
        "    .with_columns([\n",
        "        pl.col('price_diff').clip(0, None).rolling_mean(14).over('ticker').alias('rsi_up'),\n",
        "        pl.col('price_diff').abs().rolling_mean(14).over('ticker').alias('rsi_down'),\n",
        "\n",
        "        (pl.col('price') - pl.col('sma_20')).alias('bb_position')\n",
        "    ])\n",
        "\n",
        "    .with_columns([\n",
        "        (100 - (100 / (1 + pl.col('rsi_up') / pl.col('rsi_down')))).alias('rsi')\n",
        "    ])\n",
        "\n",
        "    .filter(\n",
        "        (pl.col('price') > 10) &\n",
        "        (pl.col('volume') > 100000) &\n",
        "        (pl.col('sma_20').is_not_null())\n",
        "    )\n",
        "\n",
        "    .group_by(['ticker', 'year', 'quarter'])\n",
        "    .agg([\n",
        "        pl.col('price').mean().alias('avg_price'),\n",
        "        pl.col('price').std().alias('price_volatility'),\n",
        "        pl.col('price').min().alias('min_price'),\n",
        "        pl.col('price').max().alias('max_price'),\n",
        "        pl.col('price').quantile(0.5).alias('median_price'),\n",
        "\n",
        "        pl.col('volume').sum().alias('total_volume'),\n",
        "        pl.col('dollar_volume').sum().alias('total_dollar_volume'),\n",
        "\n",
        "        pl.col('rsi').filter(pl.col('rsi').is_not_null()).mean().alias('avg_rsi'),\n",
        "        pl.col('volatility_20').mean().alias('avg_volatility'),\n",
        "        pl.col('bb_position').std().alias('bollinger_deviation'),\n",
        "\n",
        "        pl.len().alias('trading_days'),\n",
        "        pl.col('sector').n_unique().alias('sectors_count'),\n",
        "\n",
        "        (pl.col('price') > pl.col('sma_20')).mean().alias('above_sma_ratio'),\n",
        "\n",
        "        ((pl.col('price').max() - pl.col('price').min()) / pl.col('price').min())\n",
        "          .alias('price_range_pct')\n",
        "    ])\n",
        "\n",
        "    .with_columns([\n",
        "        pl.col('total_dollar_volume').rank(method='ordinal', descending=True).alias('volume_rank'),\n",
        "        pl.col('price_volatility').rank(method='ordinal', descending=True).alias('volatility_rank')\n",
        "    ])\n",
        "\n",
        "    .filter(pl.col('trading_days') >= 10)\n",
        "    .sort(['ticker', 'year', 'quarter'])\n",
        ")"
      ],
      "metadata": {
        "id": "fbAKWKXIF7UM"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df = result.collect()\n",
        "print(f\"\\n📈 Analysis Results: {df.height:,} aggregated records\")\n",
        "print(\"\\nTop 10 High-Volume Quarters:\")\n",
        "print(df.sort('total_dollar_volume', descending=True).head(10).to_pandas())\n",
        "\n",
        "print(\"\\n🔍 Advanced Analytics:\")\n",
        "\n",
        "pivot_analysis = (\n",
        "    df.group_by('ticker')\n",
        "    .agg([\n",
        "        pl.col('avg_price').mean().alias('overall_avg_price'),\n",
        "        pl.col('price_volatility').mean().alias('overall_volatility'),\n",
        "        pl.col('total_dollar_volume').sum().alias('lifetime_volume'),\n",
        "        pl.col('above_sma_ratio').mean().alias('momentum_score'),\n",
        "        pl.col('price_range_pct').mean().alias('avg_range_pct')\n",
        "    ])\n",
        "    .with_columns([\n",
        "        (pl.col('overall_avg_price') / pl.col('overall_volatility')).alias('risk_adj_score'),\n",
        "\n",
        "        (pl.col('momentum_score') * 0.4 +\n",
        "         pl.col('avg_range_pct') * 0.3 +\n",
        "         (pl.col('lifetime_volume') / pl.col('lifetime_volume').max()) * 0.3)\n",
        "         .alias('composite_score')\n",
        "    ])\n",
        "    .sort('composite_score', descending=True)\n",
        ")\n",
        "\n",
        "print(\"\\n🏆 Ticker Performance Ranking:\")\n",
        "print(pivot_analysis.to_pandas())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "HuQ92CD0GQGm",
        "outputId": "b124ebe6-f880-43b7-9595-2a490089abb0"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "📈 Analysis Results: 55 aggregated records\n",
            "\n",
            "Top 10 High-Volume Quarters:\n",
            "  ticker  year  quarter  avg_price  price_volatility  min_price   max_price  \\\n",
            "0  GOOGL  2021        1  57.202767         17.959219  21.381671  178.847677   \n",
            "1  GOOGL  2021        4  58.237738         18.053261  21.166928  141.119185   \n",
            "2   TSLA  2020        3  56.841917         17.185146  21.578360  147.829411   \n",
            "3   AMZN  2020        2  57.284240         17.566920  17.889955  145.269024   \n",
            "4   AMZN  2021        2  56.966529         17.507191  20.161137  157.495422   \n",
            "5   AMZN  2021        3  56.952561         17.064709  22.396973  137.289107   \n",
            "6   AMZN  2020        4  56.963474         17.984251  18.077893  154.224561   \n",
            "7  GOOGL  2020        4  57.423558         17.367769  22.837007  132.253255   \n",
            "8  GOOGL  2020        1  57.198658         17.192811  19.157802  160.768177   \n",
            "9   TSLA  2021        1  56.670237         17.413646  19.237795  151.668559   \n",
            "\n",
            "   median_price  total_volume  total_dollar_volume    avg_rsi  avg_volatility  \\\n",
            "0     54.579006    1963333586         1.122779e+11  32.234331       17.463582   \n",
            "1     55.526666    1916292518         1.110893e+11  33.147154       17.565799   \n",
            "2     54.134551    1917517787         1.098291e+11  31.932448       16.740977   \n",
            "3     55.034934    1915167590         1.096681e+11  32.680702       17.323514   \n",
            "4     54.784787    1930435851         1.095019e+11  33.074713       17.144568   \n",
            "5     54.722989    1906874542         1.087995e+11  32.049963       16.716027   \n",
            "6     54.060843    1893495094         1.081423e+11  32.340386       17.437918   \n",
            "7     55.047742    1863696264         1.075745e+11  32.925234       17.052431   \n",
            "8     54.806317    1877763146         1.071686e+11  32.570668       17.032767   \n",
            "9     53.879552    1866635200         1.068939e+11  31.439947       17.007282   \n",
            "\n",
            "   bollinger_deviation  trading_days  sectors_count  above_sma_ratio  \\\n",
            "0            17.515371          1706              4         0.446659   \n",
            "1            17.573425          1686              4         0.441281   \n",
            "2            16.859226          1728              4         0.446759   \n",
            "3            17.228960          1676              4         0.446301   \n",
            "4            17.095446          1683              4         0.453357   \n",
            "5            16.650819          1674              4         0.441458   \n",
            "6            17.517374          1754              4         0.428164   \n",
            "7            16.901643          1642              4         0.447016   \n",
            "8            16.719488          1678              4         0.448749   \n",
            "9            16.985419          1666              4         0.439976   \n",
            "\n",
            "   price_range_pct  volume_rank  volatility_rank  \n",
            "0         7.364532            1               11  \n",
            "1         5.666966            2                6  \n",
            "2         5.850818            3               40  \n",
            "3         7.120144            4               21  \n",
            "4         6.811832            5               27  \n",
            "5         5.129806            6               48  \n",
            "6         7.531114            7               10  \n",
            "7         4.791182            8               33  \n",
            "8         7.391786            9               39  \n",
            "9         6.883885           10               31  \n",
            "\n",
            "🔍 Advanced Analytics:\n",
            "\n",
            "🏆 Ticker Performance Ranking:\n",
            "  ticker  overall_avg_price  overall_volatility  lifetime_volume  \\\n",
            "0  GOOGL          57.167330           17.583427     1.158188e+12   \n",
            "1   AMZN          57.042317           17.570043     1.146325e+12   \n",
            "2   TSLA          57.115181           17.368533     1.141268e+12   \n",
            "3   MSFT          57.180817           17.431685     1.120276e+12   \n",
            "4   AAPL          57.082754           17.466113     1.129605e+12   \n",
            "\n",
            "   momentum_score  avg_range_pct  risk_adj_score  composite_score  \n",
            "0        0.446520       6.759596        3.251205         2.506487  \n",
            "1        0.442789       6.729687        3.246567         2.492949  \n",
            "2        0.442824       6.381269        3.288429         2.387127  \n",
            "3        0.447476       6.237898        3.280280         2.340540  \n",
            "4        0.442102       6.176378        3.268200         2.322351  \n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "a4SKoJsyFALn",
        "outputId": "4aa1883f-35fc-469e-9ea8-d90bde9c6997"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "🔄 SQL Interface Demo:\n",
            "shape: (5, 5)\n",
            "┌────────┬────────────┬────────────────────────┬──────────────┬──────────────────┐\n",
            "│ ticker ┆ mean_price ┆ volatility_consistency ┆ total_volume ┆ quarters_tracked │\n",
            "│ ---    ┆ ---        ┆ ---                    ┆ ---          ┆ ---              │\n",
            "│ str    ┆ f64        ┆ f64                    ┆ f64          ┆ u32              │\n",
            "╞════════╪════════════╪════════════════════════╪══════════════╪══════════════════╡\n",
            "│ GOOGL  ┆ 57.196227  ┆ 0.448066               ┆ 7.3922e11    ┆ 7                │\n",
            "│ AMZN   ┆ 56.951047  ┆ 0.266619               ┆ 7.2531e11    ┆ 7                │\n",
            "│ TSLA   ┆ 57.172448  ┆ 0.337545               ┆ 7.2022e11    ┆ 7                │\n",
            "│ MSFT   ┆ 57.258385  ┆ 0.447526               ┆ 7.1621e11    ┆ 7                │\n",
            "│ AAPL   ┆ 56.988095  ┆ 0.47604                ┆ 7.1573e11    ┆ 7                │\n",
            "└────────┴────────────┴────────────────────────┴──────────────┴──────────────────┘\n",
            "\n",
            "⚡ Performance Metrics:\n",
            "   • Lazy evaluation optimizations applied\n",
            "   • 100,000 records processed efficiently\n",
            "   • Memory-efficient columnar operations\n",
            "   • Zero-copy operations where possible\n",
            "\n",
            "💾 Export Options:\n",
            "   • Parquet (high compression): df.write_parquet('data.parquet')\n",
            "   • Delta Lake: df.write_delta('delta_table')\n",
            "   • JSON streaming: df.write_ndjson('data.jsonl')\n",
            "   • Apache Arrow: df.to_arrow()\n",
            "\n",
            "✅ Advanced Polars pipeline completed successfully!\n",
            "🎯 Demonstrated: Lazy evaluation, complex expressions, window functions,\n",
            "   SQL interface, advanced aggregations, and high-performance analytics\n"
          ]
        }
      ],
      "source": [
        "print(\"\\n🔄 SQL Interface Demo:\")\n",
        "pl.Config.set_tbl_rows(5)\n",
        "\n",
        "sql_result = pl.sql(\"\"\"\n",
        "    SELECT\n",
        "        ticker,\n",
        "        AVG(avg_price) as mean_price,\n",
        "        STDDEV(price_volatility) as volatility_consistency,\n",
        "        SUM(total_dollar_volume) as total_volume,\n",
        "        COUNT(*) as quarters_tracked\n",
        "    FROM df\n",
        "    WHERE year >= 2021\n",
        "    GROUP BY ticker\n",
        "    ORDER BY total_volume DESC\n",
        "\"\"\", eager=True)\n",
        "\n",
        "print(sql_result)\n",
        "\n",
        "print(f\"\\n⚡ Performance Metrics:\")\n",
        "print(f\"   • Lazy evaluation optimizations applied\")\n",
        "print(f\"   • {n_records:,} records processed efficiently\")\n",
        "print(f\"   • Memory-efficient columnar operations\")\n",
        "print(f\"   • Zero-copy operations where possible\")\n",
        "\n",
        "print(f\"\\n💾 Export Options:\")\n",
        "print(\"   • Parquet (high compression): df.write_parquet('data.parquet')\")\n",
        "print(\"   • Delta Lake: df.write_delta('delta_table')\")\n",
        "print(\"   • JSON streaming: df.write_ndjson('data.jsonl')\")\n",
        "print(\"   • Apache Arrow: df.to_arrow()\")\n",
        "\n",
        "print(\"\\n✅ Advanced Polars pipeline completed successfully!\")\n",
        "print(\"🎯 Demonstrated: Lazy evaluation, complex expressions, window functions,\")\n",
        "print(\"   SQL interface, advanced aggregations, and high-performance analytics\")"
      ]
    }
  ]
}